import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models import NumeralTickFormatter
from bokeh.io import output_notebook
output_notebook()
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
import seaborn as sns
#Setup names for our family.
females = ["Sarah"]
males = ["Jason","Leon","Conrad"]
family = females + males
#Create some lists to select or reformat data later
count_cols = [] # "1996 Count", ... "2021 Count" ** Used to drop/rename columns
rank_cols = [] # "1996 Rank", ... "2021 Rank" ** Used to drop/rename columns
years = [] # "1996", ... "2021" ** Used to wrangle data and rename columns
for f in range(1996,2022):
count_cols.append(str(f) + ' Count')
rank_cols.append(str(f) + ' Rank')
years.append(str(f))
#Import ONS data
df1 = pd.read_excel("babynames1996to2021.xlsx","1",skiprows =7) # Boys names in workbook 1
df2 = pd.read_excel( 'babynames1996to2021.xlsx',"2",skiprows =7) # Girls names in workbook 2
df1 = df1.set_index("Name")
df2 = df2.set_index("Name")
#Replace NaN with 0
df1 = df1.replace("[x]",0)
df2 = df2.replace("[x]",0)
#Filter names from ONS data
Girls = df2.loc[females]
Boys = df1.loc[males]
df_family = pd.concat([Girls, Boys], axis=0)
#Drop rank and count columns
df_family_counts = df_family.drop(columns=rank_cols)
df_family_ranks = df_family.drop(columns=count_cols)
#Rename columns "1996 Count" -> "1996" and "1996 Rank" -> "1996"
df_family_counts.columns = df_family_counts.columns.str.replace(" Count","")
df_family_ranks.columns = df_family_ranks.columns.str.replace(" Rank","")
#Pivot DFs so rows are date observations
df_family_counts = pd.pivot_table(df_family_counts, values = years, columns=["Name"])
df_family_counts.index.name = "Year"
df_family_counts = df_family_counts.reindex(columns=family)
df_family_ranks = pd.pivot_table(df_family_ranks, values = years, columns=["Name"])
df_family_ranks.index.name = "Year"
df_family_ranks = df_family_ranks.reindex(columns=family)
#Display the rank table for years since children born
df_family_ranks.tail(14)
| Name | Sarah | Jason | Leon | Conrad |
|---|---|---|---|---|
| Year | ||||
| 2008 | 72 | 139 | 62 | 489 |
| 2009 | 79 | 145 | 60 | 473 |
| 2010 | 83 | 161 | 60 | 536 |
| 2011 | 90 | 159 | 62 | 576 |
| 2012 | 95 | 165 | 72 | 734 |
| 2013 | 96 | 155 | 75 | 611 |
| 2014 | 95 | 162 | 85 | 714 |
| 2015 | 96 | 163 | 85 | 779 |
| 2016 | 96 | 170 | 92 | 870 |
| 2017 | 103 | 179 | 98 | 861 |
| 2018 | 103 | 174 | 97 | 851 |
| 2019 | 107 | 175 | 93 | 1152 |
| 2020 | 115 | 213 | 93 | 921 |
| 2021 | 125 | 210 | 101 | 1448 |
df_family_counts.tail(14)
| Name | Sarah | Jason | Leon | Conrad |
|---|---|---|---|---|
| Year | ||||
| 2008 | 930 | 404 | 1138 | 71 |
| 2009 | 793 | 406 | 1155 | 76 |
| 2010 | 722 | 344 | 1204 | 64 |
| 2011 | 663 | 360 | 1069 | 61 |
| 2012 | 592 | 359 | 918 | 45 |
| 2013 | 574 | 372 | 837 | 56 |
| 2014 | 601 | 353 | 765 | 45 |
| 2015 | 581 | 355 | 795 | 41 |
| 2016 | 572 | 340 | 737 | 36 |
| 2017 | 530 | 314 | 669 | 36 |
| 2018 | 503 | 311 | 655 | 36 |
| 2019 | 479 | 302 | 662 | 23 |
| 2020 | 422 | 240 | 620 | 31 |
| 2021 | 403 | 251 | 590 | 17 |
#Create totals DF
df1_totals = df1.drop(columns=rank_cols).sum()
df2_totals = df2.drop(columns=rank_cols).sum()
df1_totals = df1_totals.reset_index()
df2_totals = df2_totals.reset_index()
df_totals = pd.concat([df1_totals, df2_totals], axis=0)
df_totals = df_totals.replace(count_cols,years)
#df_totals = df1_totals + df2_totals
df_totals.columns=["Years","Total"]
df_totals = df_totals.groupby("Years").sum()
#Calculate accumulative percentage change
df_family_diff = pd.DataFrame(df_family_counts,copy=True)
temp_col = [] # Build a list of temp columns to drop later
for name in family:
df_family_diff[name + " Diff"] = df_family_diff[name].diff()
temp_col.append(name + " Diff")
df_family_diff = df_family_diff.fillna(0)
for name in family:
df_family_diff[name + " %"] = df_family_diff[name + " Diff"] / df_family_diff[name]
df_family_diff[name + " %"] = df_family_diff[name + " %"].cumsum()
# Drop the columns not required for plotting
temp_col += family
df_family_diff = df_family_diff.drop(columns=temp_col)
source = ColumnDataSource(df_totals)
p = figure(title="Total UK baby names", x_axis_label="Year", y_axis_label="Babies",x_range=years, width=1100, height=500)
p.line(source=source, x="Years",y="Total")
show(p)
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_counts)
p = figure(title="UK baby count for selected names", x_axis_label="", y_axis_label="Babies",x_range=years, width=1100, height=500)
# add a line renderer with legend and line thickness
p.line(x = "Year", y = "Sarah", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)
# show the results
show(p)
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_ranks)
p = figure(title="UK baby name rankings", x_axis_label="Year",y_range=(1500,1), y_axis_label="Name UK rank",x_range=years, width=1100, height=500)
# add a line renderer with legend and line thickness
p.line(x = "Year", y = "Sarah", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)
# show the results
show(p)
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_diff)
p = figure(title="Name Popularity % Change", x_axis_label="Year", y_axis_label="%",x_range=years, width=1100, height=500)
# add a line renderer with legend and line thickness
p.line(x = "Year", y = "Sarah %", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason %", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon %", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad %", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)
# show the results
show(p)
df_totals.plot(title="Total UK baby names",figsize=(15,6),grid=True)
<AxesSubplot:title={'center':'Total UK baby names'}, xlabel='Years'>
df_family_counts.plot(title="UK baby count for selected names",figsize=(15,6),grid=True)
<AxesSubplot:title={'center':'UK baby count for selected names'}, xlabel='Year'>
df_family_ranks.plot(title="UK baby name rankings",figsize=(15,6),grid=True)
plt.ylim(1500,1)
(1500.0, 1.0)
df_family_diff.plot(title="Name Popularity % Change",figsize=(15,6),grid=True)
<AxesSubplot:title={'center':'Name Popularity % Change'}, xlabel='Year'>
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(df_totals).set(title="Total UK baby names")
plt.grid(True)
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(df_family_counts).set(title="UK baby count for selected names")
plt.grid(True)
fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(df_family_ranks).set(title="UK baby rank for selected names")
plt.ylim(1500,1)
plt.grid(True)
fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(df_family_diff).set(title="Name Popularity % Change")
plt.grid(True)
px.line(df_totals, title = "Total UK baby names")
px.line(df_family_counts, title="UK baby count for selected names")
px.line(df_family_ranks, title="UK baby rank for selected names")
fig.update_yaxes(autorange="reversed")
px.line(df_family_diff, title="Name Popularity % Change")